This flow is based on: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5747346/. RRBS data are assumed to have two columns per sample, one with the “-Me” suffix and the other with the “-Un” suffix. Additional comments about RRBS data:

  1. We set dummy variables for each vial label
  2. Condition/group dummy variables have 1 only for the “Me” data
  3. We estimate the common dispersion parameter once on all the data
  4. Library sizes are computed by looking and Me and Un for each sample (keep.lib.sizes=FALSE causes the library sizes to be recomputed after applying a filter)

This paper uses glmFit-based pipeline, whereas others recommend using glmQFit as it is more accurate for controlling type 1 error - see comments in https://support.bioconductor.org/p/76790/#:~:text=glmQLFit%20uses%20the%20trended%20NB,a%20GLM%20%2D%20and%20that’s%20it

Specify parameters

# edgeR's tolerance parameter: default is 1e-06, but this seems to be too slow
edger_tol = 1e-05

# Specify the parameters required for the analysis
# This directory should have the MotrpacRatTraining6moQCRep repository
# clone from: https://github.com/MoTrPAC/MotrpacRatTraining6moQCRep
repo_local_dir = "~/Desktop/repos/MotrpacRatTraining6moQCRep"
# Runnable command for gsutil
gsutil_cmd = "~/google-cloud-sdk/bin/gsutil"
# Where should the data be downloaded to
local_data_dir = "ADDPATH"
# these paths were removed for security reasons, see the repo's readme
bucket = "ADDPATH"
pheno_bucket = "ADDPATH"
pheno_local_dir = "ADDPATH"

# we require a site to have a total count
# (both methylated and unmethylated) of at least min_count_coverage in 
# at least min_per_samples * (#samples); 
# specifying min_per_samples=1 means all samples
min_count_coverage = 10
min_per_samples = 1

# run_mode can be: promoters, windows, or sites
run_mode = "windows"
# parameters for defining promoters: 
# define region sizes around TSS per gene
promoter_coord = c(-1000,2000) # relevant only if run_mode=="promoters"
# size of tiles for the genome
wsize = 500 # relevant only if run_mode == "windows"

# this parameter specifies whether the DEA were computed 
# externally e.g., using a cluster
# in these cases, we simply load the results files names
# TISSUE_rrbs_dea_results.RData
# make sure that the correct results files exist in the working
# directory when using this option or the notebook may avoid
# loading the data and try to run DEA, which is very slow
load_dea_results_from_rdata = T

# This parameter specifies whether to process the raw
# data or load them from an external file called
# tissue2rrbs_processed_data.RData
# Note that if this file does not exist in the working
# directory then the notebook will continue to process the data, 
# which is very slow
load_processed_data_from_rdata = T

# Specify how many PCs to examine
num_pcs = 5
num_pcs_for_outlier_analysis = 3
# Specify the significance level for association analysis
# (e.g., between a PC and a clinical variable)
p_thr = 0.0001
OUTLIER_IQR_THR = 5
# Outliers added by MSSM
samples_to_remove = c("90266016903","90218016903","90578016903",
                      "90450015903","90585017004")
# outliers added by BIC (lung and hippo outliers, August 2021)
samples_to_remove = c(samples_to_remove,
                      c("90406015203","90251016604"))

# output path
# v2 means the results of a pipeline using
# run_mode = "windows", wsize=500,
# min_count_coverage = 10, min_per_samples = 1
# rho for clustering is the default of 0.7
data_out_bucket = "ADDPATH"
dea_out_bucket = "ADDPATH"
# Define technical and biological variables to be considered in the QC
pipeline_qc_cols = c("reads","pct_Unaligned","pct_GC","pct_chrM")
biospec_cols = c("registration.sex","key.anirandgroup",
                 "registration.batchnumber",
                 "training.staffid",
                  "is_control",
                  "vo2.max.test.vo2_max_visit1", # this assumes that visit1's are aligned
                  "terminal.weight.mg","time_to_freeze",
                  "timepoint","bid","pid")
# load functions and libraries
# load functions and libraries
source(paste0(repo_local_dir,"/tools/unsupervised_normalization_functions.R"))
source(paste0(repo_local_dir,"/tools/gcp_functions.R"))
source(paste0(repo_local_dir,"/tools/qc_report_aux_functions.R"))
source(paste0(repo_local_dir,"/tools/config_session.R"))
source(paste0(repo_local_dir,"/tools/association_analysis_methods.R"))

Get the Rat annotations:

# manage the gene annotation
# # Install the rat db
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("org.Rn.eg.db")
library("org.Rn.eg.db")
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 

Load the RRBS data from the bucket and filter low counts

This flow is based on: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5747346/ Note that RRBS data are assumed to have two columns per sample, one with the “-Me” suffix and the other with the “-Un” suffix.

First, download the data to local dir (skip this if already downloaded, these are large data tables).

obj = download_bucket_to_local_dir(bucket,local_path=local_data_dir,
            remove_prev_files = T,GSUTIL_PATH=gsutil_cmd)
# Second, delete bismark files
bis_files = obj$downloaded_files[
 grepl("bismark.cov.gz",obj$downloaded_files)
]
rem_obj = sapply(bis_files,file.remove)

Now load and filter the RRBS data:

obj = list(
  downloaded_files = list.files(local_data_dir,full.names = T,recursive = T)
)
obj$downloaded_files = obj$downloaded_files[
  grepl("epigen-rrbs",obj$downloaded_files)
  ]
rdata_files = obj$downloaded_files[
  grepl(".RData$",ignore.case = T,obj$downloaded_files)
  ]
names(rdata_files) = sapply(rdata_files,GET_get_tissue_from_download_path)
tissues = names(rdata_files)
tissue2rrbs_data = list()

#' Merge an edgeR DGEList object using a clustering of the sites
merge_sites_by_clusters<-function(yall,new_clusters){
    if (!"LocStart" %in% colnames(yall$genes)){
      cl_start = tapply(yall$genes$Locus,new_clusters,min)
    }
    else{
      cl_start = tapply(yall$genes$LocStart,new_clusters,min)
    }
    if (!"LocEnd" %in% colnames(yall$genes)){
      cl_end = tapply(yall$genes$Locus,new_clusters,max)
    }
    else{
      cl_end = tapply(yall$genes$LocEnd,new_clusters,max)
    }
    # assume that we do not merge sites across different chormosomes
    cl_chr = as.character(tapply(yall$genes$Chr,new_clusters,function(x)x[1]))
    cl_entrez = tapply(yall$genes$EntrezID,new_clusters,unique)
    cl_entrez = sapply(cl_entrez,paste,collapse=",")
    cl_symbol = tapply(yall$genes$Symbol,new_clusters,unique)
    cl_symbol = sapply(cl_symbol,paste,collapse=",")
    cl_strand = tapply(yall$genes$Strand,new_clusters,unique)
    cl_strand = sapply(cl_strand,paste,collapse=",")
    cl_loc = paste(cl_start,cl_end,sep="-")
    cl_sites = tapply(yall$genes$Locus,new_clusters,unique)
    cl_sites = sapply(cl_sites,paste,collapse=",")
    if (!"NumSites" %in% colnames(yall$genes)){
      cl_num_sites = tapply(yall$genes$Strand,new_clusters,length)
    }
    else{
      cl_num_sites = tapply(yall$genes$NumSites,new_clusters,sum)
    }
    # sanity checks: orders must be the same
    if(!(all(names(cl_end)==names(cl_start))) ||
       !(all(names(cl_end)==names(cl_symbol))) ||
       !(all(names(cl_end)==names(cl_entrez))) ||
       !(all(names(cl_end)==names(cl_strand)))
    ){
      print("Error in tapply by cluster name")
    }
    cluster_df = data.frame(
      Chr = cl_chr, 
      Locus = cl_loc,
      EntrezID = cl_entrez,
      Symbol = cl_symbol,
      Strand = cl_strand,
      Width = cl_end - cl_start,
      NumSites = cl_num_sites,
      Sites = cl_sites,
      LocStart = cl_start, LocEnd = cl_end
    )
    rownames(cluster_df) = names(cl_start)
    # sum counts in regions
    newy = rowsum(yall,new_clusters,reorder=T)
    # sanity check: symbols should be the same:
    if(!all(rownames(newy) == rownames(cluster_df))){print("Error after rowsum")}
    # update the gene annotation
    newy$genes = cluster_df[rownames(newy$counts),]
    # reorder
    o <- order(newy$genes$Chr, newy$genes$LocStart)
    newy <- newy[o,]
    return(newy)
}

library(MCL);library(corrplot)
# from: https://www.micans.org/mcl/intro.html
# MCL: Expansion coincides with taking the power of a stochastic matrix
# using the normal matrix product (i.e. matrix squaring). 
# Inflation corresponds with taking the Hadamard power of a matrix 
# (taking powers entrywise), followed by a scaling step, 
# such that the resulting matrix is stochastic again, 
# i.e. the matrix elements (on each column) correspond to probability values.
# Inlation: strengthen intra-region connections
analyze_tile<-function(tile_name,tile_l,M,min_cor=0.7,
                       inflations = c(3,2.5,2,1.5),plotcorr=F){
  #print(tile_name)
  tile_inds = tile_l[[tile_name]]
  if(length(tile_inds) < 2){
    v = tile_name
    names(v) = rownames(M)[tile_inds]
    return(v)
  }
  tile_M = M[tile_inds,]
  tile_corrs = cor(t(tile_M))
  if(plotcorr){
    corrplot(tile_corrs)
  }
  tile_clusters = NULL
  for(infl in inflations){
    tr = tryCatch({
      tile_clusters = mcl(tile_corrs > min_cor,addLoops = T,allow1=T,
                      inflation = infl)$Cluster
    }, error = function(x){}
    )
    if(!is.null(tile_clusters)){break}
  }
  if(is.null(tile_clusters)){
    print("Warning: MCL failed, using singletons instead")
    tile_clusters = 1:length(tile_inds)
  }
  
  # Solve mcl's bug of skipped cluster numbering
  tile_clusters = as.numeric(ordered(tile_clusters))
  tile_clusters = paste0(tile_name,"_cluster",tile_clusters)
  names(tile_clusters) = rownames(tile_M)
  return(tile_clusters)
}

# # check chr2, tile that contains 
# # or chr12 that contains Brca2
# analyze_tile("chr2-257637",tile_l,M,plotcorr = T,min_cor = 0.7)
# analyze_tile("chr12-658",tile_l,M,plotcorr = T,min_cor = 0.7)
# analyze_tile("chr1-213836",tile_l,M,plotcorr = T,min_cor = 0.7)
# analyze_tile(names(tile_l)[5],tile_l,M,plotcorr = T)

if(load_processed_data_from_rdata &&
   file.exists("tissue2rrbs_processed_data2.RData")){
  load("tissue2rrbs_processed_data2.RData")  
}else{
    for (tissue in tissues){
      curr_rdata_file = rdata_files[grepl(tissue,rdata_files)]
      if(length(curr_rdata_file)!=1){print(paste("Warning: more than a single RData file in tissue:",tissue))}
      yall = get(load(curr_rdata_file))
      
      # remove control samples
      is_sample = grepl("^9",colnames(yall),perl=T)
      yall = yall[,is_sample]
    
      # Filtering unassembled chromosomes
      keep <- rep(TRUE, nrow(yall))
      Chr <- as.character(yall$genes$Chr)
      keep[ grep("random",Chr) ] <- FALSE
      keep[ grep("chrUn",Chr) ] <- FALSE
      # remove non-chr ones
      keep[!grepl("chr",Chr) ] <- FALSE
      # remove M chromosome (otherwise we get error when assigning annotation below)
      keep[Chr=="chrM"] <- FALSE
      print(paste("Data from tissue",tissue,"was loaded into session"))
      print(paste("how many sites were removed (unassembled+chrM)?",sum(!keep)))
      # keep.lib.sizes=FALSE causes the library sizes to be recomputed
      yall <- yall[keep,, keep.lib.sizes=FALSE]
      # fix the levels and order by chromosome
      # rat genome chromosomes:
      ChrNames <- paste0("chr",c(1:20,"X","Y"))
      #ChrNames <- paste0("chr",c(1:2))
      yall$genes$Chr <- factor(yall$genes$Chr, levels=ChrNames)
      o <- order(yall$genes$Chr, yall$genes$Locus)
      yall <- yall[o,]
      print(paste("Counts matrix dim before low counts filter:"))
      print(dim(yall))
      gc()
      
      # Remove low count features
      # The analysis needs to be restricted to CpG sites that have enough coverage for the
      # methylation level to be measurable in a meaningful way at that site. 
      # As a conservative rule of thumb, we require a site to have a total count
      # (both methylated and unmethylated) of at least 8 in every sample.
      Coverage <- yall$counts[,grepl("-Me",colnames(yall$counts))] + 
        yall$counts[, grepl("-Un",colnames(yall$counts))]
      # see description of min_count_coverage and min_per_samples above
      keep <- rowSums(Coverage >= min_count_coverage) >= min_per_samples*ncol(Coverage)
      # filter the data
      yall <- yall[keep,, keep.lib.sizes=FALSE]
      print(paste("Counts matrix dim after low counts filter:"))
      print(dim(yall))
      
      # get locations, gene ids, etc.
      TSS <- nearestTSS(yall$genes$Chr, yall$genes$Locus, species="Rn")
      yall$genes$EntrezID <- TSS$gene_id
      yall$genes$Symbol <- TSS$symbol
      yall$genes$Strand <- TSS$strand
      yall$genes$Distance <- TSS$distance
      yall$genes$Width <- TSS$width
      
      if(run_mode == "promoters"){
          # Map data to promoters and sum over regions
          InPromoter <- yall$genes$Distance >= promoter_coord[1] & yall$genes$Distance <= promoter_coord[2]
          # We subset the CpGs to those contained in a promoter region:
          yIP <- yall[InPromoter,,keep.lib.sizes=FALSE]
          # Compute total counts
          ypr <- rowsum(yIP, yIP$genes$EntrezID, reorder=FALSE)
          yall = ypr
          gc()
      }
      
      if(run_mode == "windows"){
        # # a slow implementation of step 1 using external tools:
        # bsobj = BSseq(
        #   Cov = yall$counts[,grepl("-Me",colnames(yall$counts))] + 
        #     yall$counts[, grepl("-Un",colnames(yall$counts))],
        #     M = yall$counts[,grepl("-Me",colnames(yall$counts))],
        #     chr = as.character(yall$genes$Chr),
        #     pos = yall$genes$Locus
        # )
        # new_bs_obj = methylSig::tile_by_windows(bsobj, win_size = 500)
      
        # step 1: define the genome-level tiles
        chrs = as.character(yall$genes$Chr)
        pos = round(yall$genes$Locus/wsize)
        tiles = paste(chrs,pos,sep="-")
        names(tiles) = rownames(yall$counts)
        
        # step 2: add the M matrix
        Me <- yall$counts[, grepl("-Me",colnames(yall$counts))]
        Un <- yall$counts[, grepl("-Un",colnames(yall$counts))]
        M <- log2(Me + 2) - log2(Un + 2)
        
        # step 3: analyze each tile
        tile_l = split(1:nrow(M),tiles)
        tile_l = tile_l[unique(tiles)] # keep the genome order
        # single core
        #tile_clusters = lapply(names(tile_l),analyze_tile,tile_l=tile_l,M=M)
        # parallel comp
        tile_clusters = copy(tiles)
        for(chr in unique(chrs)){
          print(chr)
          curr_chr_inds = grepl(paste0(chr,"-"),tiles)
          chr_tiles = unique(tiles[curr_chr_inds])
          chr_tile_l = tile_l[chr_tiles]
          print(system.time({
            chr_tile_clusters = mclapply(chr_tiles,analyze_tile,
              tile_l = chr_tile_l,M = M, mc.cores = 5)
            chr_tile_clusters = unlist(chr_tile_clusters)
            tile_clusters[names(chr_tile_clusters)] = chr_tile_clusters
          }))
          
          gc()
        }
        yall = merge_sites_by_clusters(yall,tile_clusters)
      }
     
      # Data normalization
      # A key difference between BS-seq and other sequencing data is that the pair of libraries 
      # holding the methylated and unmethylated reads for a particular sample are treated as a unit.
      # To ensure that the methylated and unmethylated reads for the same sample are treated on the
      # same scale, we need to set the library sizes to be equal for each pair of libraries. 
      # We set the library sizes for each sample to be the average of the total read counts for 
      # the methylated and unmethylated libraries.
      # Other normalization methods developed for RNA-seq data, such as TMM, are not required for BS-seq data.
      TotalLibSize <- yall$samples$lib.size[grepl("-Me",colnames(yall$counts))] +
        +       yall$samples$lib.size[grepl("-Un",colnames(yall$counts))]
      yall$samples$lib.size <- rep(TotalLibSize, each=2)
      tissue2rrbs_data[[tissue]] = yall
      
      #  rm(yall);rm(y);gc()
  }
  
  # Get the pipeline qc scores
  pipeline_scores_file = obj$downloaded_files[
      grepl("qa-qc-metrics",obj$downloaded_files) & 
        grepl(".csv$",obj$downloaded_files)
    ]
  if(length(pipeline_scores_file)!=1){
    print("Error: RRBS data does not have a single pipeline scores file")
  }
  pipelineqc_scores = read.csv(pipeline_scores_file)
  rownames(pipelineqc_scores) = as.character(pipelineqc_scores$vial_label)
  save(tissue2rrbs_data,pipelineqc_scores,file="tissue2rrbs_processed_data2.RData")
}
## [1] "Data from tissue t52-hippocampus was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 72675"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6538607     100
## [1] "Counts matrix dim after low counts filter:"
## [1] 1323991     100
## [1] "chr1"
##    user  system elapsed 
##  61.297  12.450  17.269 
## [1] "chr2"
##    user  system elapsed 
##  31.038   6.116   8.753 
## [1] "chr3"
##    user  system elapsed 
##  38.044   6.675   9.537 
## [1] "chr4"
##    user  system elapsed 
##  26.764   5.680   7.461 
## [1] "chr5"
##    user  system elapsed 
##  35.319   6.448   8.971 
## [1] "chr6"
##    user  system elapsed 
##  28.727   6.835   7.787 
## [1] "chr7"
##    user  system elapsed 
##  33.012   6.297   8.645 
## [1] "chr8"
##    user  system elapsed 
##  28.967   6.262   8.451 
## [1] "chr9"
##    user  system elapsed 
##  23.300   5.562   6.518 
## [1] "chr10"
##    user  system elapsed 
##  37.685   6.484   9.514 
## [1] "chr11"
##    user  system elapsed 
##  13.498   4.452   4.406 
## [1] "chr12"
##    user  system elapsed 
##  18.401   4.824   5.417 
## [1] "chr13"
##    user  system elapsed 
##  14.638   4.344   4.287 
## [1] "chr14"
##    user  system elapsed 
##  18.806   5.504   5.698 
## [1] "chr15"
##    user  system elapsed 
##  14.906   4.504   4.565 
## [1] "chr16"
##    user  system elapsed 
##  15.381   4.594   4.772 
## [1] "chr17"
##    user  system elapsed 
##  16.705   4.843   4.867 
## [1] "chr18"
##    user  system elapsed 
##  14.419   4.471   4.435 
## [1] "chr19"
##    user  system elapsed 
##  15.640   4.670   4.474 
## [1] "chr20"
##    user  system elapsed 
##  15.109   4.433   4.551 
## [1] "chrX"
##    user  system elapsed 
##   3.069   1.300   1.124 
## [1] "chrY"
##    user  system elapsed 
##   0.239   0.157   0.291 
## [1] "Data from tissue t55-gastrocnemius was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 74600"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6869469     100
## [1] "Counts matrix dim after low counts filter:"
## [1] 1568128     100
## [1] "chr1"
##    user  system elapsed 
## 159.869 115.236  82.323 
## [1] "chr2"
##    user  system elapsed 
## 105.478  62.089  42.732 
## [1] "chr3"
##    user  system elapsed 
## 136.339  78.058  56.258 
## [1] "chr4"
##    user  system elapsed 
##  88.059  46.871  36.162 
## [1] "chr5"
##    user  system elapsed 
## 130.284  76.540  53.256 
## [1] "chr6"
##    user  system elapsed 
##  91.669  48.048  36.452 
## [1] "chr7"
##    user  system elapsed 
## 135.023  86.693  57.016 
## [1] "chr8"
##    user  system elapsed 
## 113.671  70.370  50.457 
## [1] "chr9"
##    user  system elapsed 
##  76.955  44.585  30.020 
## [1] "chr10"
##    user  system elapsed 
## 154.741  98.716  68.475 
## [1] "chr11"
##    user  system elapsed 
##  43.038  19.657  19.029 
## [1] "chr12"
##    user  system elapsed 
##  61.502  34.755  24.595 
## [1] "chr13"
##    user  system elapsed 
##  49.185  24.875  20.725 
## [1] "chr14"
##    user  system elapsed 
##  74.662  44.846  33.237 
## [1] "chr15"
##    user  system elapsed 
##  55.578  27.442  22.678 
## [1] "chr16"
##    user  system elapsed 
##  59.472  29.168  24.948 
## [1] "chr17"
##    user  system elapsed 
##  72.256  43.022  27.579 
## [1] "chr18"
##    user  system elapsed 
##  49.438  24.687  22.319 
## [1] "chr19"
##    user  system elapsed 
##  67.266  37.839  26.517 
## [1] "chr20"
##    user  system elapsed 
##  61.237  33.677  25.689 
## [1] "chrX"
##    user  system elapsed 
##   5.377   2.803   2.529 
## [1] "chrY"
##    user  system elapsed 
##   0.405   0.255   0.501 
## [1] "Data from tissue t58-heart was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 72607"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6597238     100
## [1] "Counts matrix dim after low counts filter:"
## [1] 1333039     100
## [1] "chr1"
##    user  system elapsed 
## 168.438 124.556  83.336 
## [1] "chr2"
##    user  system elapsed 
##  97.236  61.863  41.823 
## [1] "chr3"
##    user  system elapsed 
## 112.444  72.297  48.703 
## [1] "chr4"
##    user  system elapsed 
##  88.769  56.229  36.434 
## [1] "chr5"
##    user  system elapsed 
## 128.035  83.230  56.646 
## [1] "chr6"
##    user  system elapsed 
##  84.496  56.464  36.516 
## [1] "chr7"
##    user  system elapsed 
## 106.058  68.330  46.187 
## [1] "chr8"
##    user  system elapsed 
##  87.571  56.336  37.627 
## [1] "chr9"
##    user  system elapsed 
##  59.116  38.604  25.081 
## [1] "chr10"
##    user  system elapsed 
## 126.254  79.037  52.643 
## [1] "chr11"
##    user  system elapsed 
##  31.142  16.210  11.576 
## [1] "chr12"
##    user  system elapsed 
##  48.730  27.769  20.242 
## [1] "chr13"
##    user  system elapsed 
##  36.053  17.861  14.785 
## [1] "chr14"
##    user  system elapsed 
##  50.778  27.590  22.989 
## [1] "chr15"
##    user  system elapsed 
##  36.397  17.985  13.678 
## [1] "chr16"
##    user  system elapsed 
##  42.431  20.768  16.209 
## [1] "chr17"
##    user  system elapsed 
##  43.934  22.233  16.150 
## [1] "chr18"
##    user  system elapsed 
##  29.187  15.439  11.225 
## [1] "chr19"
##    user  system elapsed 
##  42.944  21.627  20.399 
## [1] "chr20"
##    user  system elapsed 
##  33.623  17.132  13.789 
## [1] "chrX"
##    user  system elapsed 
##   2.234   1.165   1.360 
## [1] "chrY"
##    user  system elapsed 
##   0.296   0.212   0.368 
## [1] "Data from tissue t59-kidney was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 73240"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6638813     100
## [1] "Counts matrix dim after low counts filter:"
## [1] 934525    100
## [1] "chr1"
##    user  system elapsed 
## 114.205  89.449  53.983 
## [1] "chr2"
##    user  system elapsed 
##  76.565  49.871  35.704 
## [1] "chr3"
##    user  system elapsed 
##  91.490  51.115  37.732 
## [1] "chr4"
##    user  system elapsed 
##  64.114  43.222  28.967 
## [1] "chr5"
##    user  system elapsed 
##  66.430  52.031  35.818 
## [1] "chr6"
##    user  system elapsed 
##  29.441  24.528  13.837 
## [1] "chr7"
##    user  system elapsed 
##  36.538  27.145  15.971 
## [1] "chr8"
##    user  system elapsed 
##  31.095  25.890  14.259 
## [1] "chr9"
##    user  system elapsed 
##  17.548   9.807   7.003 
## [1] "chr10"
##    user  system elapsed 
##  42.763  29.425  20.483 
## [1] "chr11"
##    user  system elapsed 
##   8.519   5.668   3.901 
## [1] "chr12"
##    user  system elapsed 
##  17.693  10.543   7.710 
## [1] "chr13"
##    user  system elapsed 
##  11.110   7.252   4.867 
## [1] "chr14"
##    user  system elapsed 
##  13.154   8.170   5.495 
## [1] "chr15"
##    user  system elapsed 
##  11.071   7.389   5.086 
## [1] "chr16"
##    user  system elapsed 
##  10.276   6.752   4.834 
## [1] "chr17"
##    user  system elapsed 
##  16.700   9.195   6.394 
## [1] "chr18"
##    user  system elapsed 
##   9.902   6.770   4.817 
## [1] "chr19"
##    user  system elapsed 
##  12.038   7.207   5.050 
## [1] "chr20"
##    user  system elapsed 
##  10.241   6.642   4.357 
## [1] "chrX"
##    user  system elapsed 
##   1.008   0.590   0.591 
## [1] "chrY"
##    user  system elapsed 
##   0.199   0.151   0.262 
## [1] "Data from tissue t66-lung was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 70433"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6437815     100
## [1] "Counts matrix dim after low counts filter:"
## [1] 1604405     100
## [1] "chr1"
##    user  system elapsed 
##  90.218  68.709  44.362 
## [1] "chr2"
##    user  system elapsed 
##  46.200  27.564  21.427 
## [1] "chr3"
##    user  system elapsed 
##  57.608  36.521  25.491 
## [1] "chr4"
##    user  system elapsed 
##  37.557  24.834  15.846 
## [1] "chr5"
##    user  system elapsed 
##  52.847  35.044  23.485 
## [1] "chr6"
##    user  system elapsed 
##  35.090  23.577  14.483 
## [1] "chr7"
##    user  system elapsed 
##  52.179  32.495  23.443 
## [1] "chr8"
##    user  system elapsed 
##  40.417  26.187  17.822 
## [1] "chr9"
##    user  system elapsed 
##  31.434  22.269  14.258 
## [1] "chr10"
##    user  system elapsed 
##  61.015  37.557  25.262 
## [1] "chr11"
##    user  system elapsed 
##  11.972   7.447   4.890 
## [1] "chr12"
##    user  system elapsed 
##  24.721  15.417  10.846 
## [1] "chr13"
##    user  system elapsed 
##  14.342   8.394   6.247 
## [1] "chr14"
##    user  system elapsed 
##  25.576  16.208  12.142 
## [1] "chr15"
##    user  system elapsed 
##  13.869   8.093   5.976 
## [1] "chr16"
##    user  system elapsed 
##  16.101   9.290   6.624 
## [1] "chr17"
##    user  system elapsed 
##  16.901   9.639   7.074 
## [1] "chr18"
##    user  system elapsed 
##  13.804   8.085   5.716 
## [1] "chr19"
##    user  system elapsed 
##  21.608  12.578  10.437 
## [1] "chr20"
##    user  system elapsed 
##  16.658   9.670   6.797 
## [1] "chrX"
##    user  system elapsed 
##   4.683   3.071   2.241 
## [1] "chrY"
##    user  system elapsed 
##   0.222   0.145   0.281 
## [1] "Data from tissue t68-liver was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 71495"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6499281     100
## [1] "Counts matrix dim after low counts filter:"
## [1] 1276062     100
## [1] "chr1"
##    user  system elapsed 
## 112.253  82.491  55.484 
## [1] "chr2"
##    user  system elapsed 
##  61.544  46.345  28.865 
## [1] "chr3"
##    user  system elapsed 
##  64.577  46.527  29.160 
## [1] "chr4"
##    user  system elapsed 
##  48.351  27.830  20.988 
## [1] "chr5"
##    user  system elapsed 
##  73.184  50.574  32.978 
## [1] "chr6"
##    user  system elapsed 
##  44.439  25.355  20.220 
## [1] "chr7"
##    user  system elapsed 
##  62.907  49.565  30.417 
## [1] "chr8"
##    user  system elapsed 
##  60.087  43.299  27.194 
## [1] "chr9"
##    user  system elapsed 
##  37.557  24.740  15.693 
## [1] "chr10"
##    user  system elapsed 
##  72.160  56.138  36.177 
## [1] "chr11"
##    user  system elapsed 
##  15.871   9.285   6.513 
## [1] "chr12"
##    user  system elapsed 
##  32.259  19.264  13.288 
## [1] "chr13"
##    user  system elapsed 
##  28.117  22.230  13.096 
## [1] "chr14"
##    user  system elapsed 
##  34.942  22.879  14.064 
## [1] "chr15"
##    user  system elapsed 
##  29.361  19.499  12.568 
## [1] "chr16"
##    user  system elapsed 
##  29.684  23.841  14.071 
## [1] "chr17"
##    user  system elapsed 
##  32.855  26.581  15.081 
## [1] "chr18"
##    user  system elapsed 
##  19.478  12.659  11.843 
## [1] "chr19"
##    user  system elapsed 
##  32.711  20.167  13.341 
## [1] "chr20"
##    user  system elapsed 
##  24.792  15.603  11.625 
## [1] "chrX"
##    user  system elapsed 
##   1.323   0.706   0.737 
## [1] "chrY"
##    user  system elapsed 
##   0.238   0.174   0.319 
## [1] "Data from tissue t69-brown-adipose was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 78814"
## [1] "Counts matrix dim before low counts filter:"
## [1] 7385104     100
## [1] "Counts matrix dim after low counts filter:"
## [1] 1707298     100
## [1] "chr1"
##    user  system elapsed 
## 146.433 104.623  71.231 
## [1] "chr2"
##    user  system elapsed 
##  73.117  57.159  38.342 
## [1] "chr3"
##    user  system elapsed 
##  98.804  76.151  46.993 
## [1] "chr4"
##    user  system elapsed 
##  62.275  45.774  28.306 
## [1] "chr5"
##    user  system elapsed 
##  88.351  59.566  40.224 
## [1] "chr6"
##    user  system elapsed 
##  61.816  45.729  28.919 
## [1] "chr7"
##    user  system elapsed 
##  93.378  67.834  43.285 
## [1] "chr8"
##    user  system elapsed 
##  66.476  50.325  30.992 
## [1] "chr9"
##    user  system elapsed 
##  47.529  31.222  22.580 
## [1] "chr10"
##    user  system elapsed 
##  99.767  72.041  47.263 
## [1] "chr11"
##    user  system elapsed 
##  28.212  20.234  13.736 
## [1] "chr12"
##    user  system elapsed 
##  38.613  25.132  16.759 
## [1] "chr13"
##    user  system elapsed 
##  32.448  25.065  15.049 
## [1] "chr14"
##    user  system elapsed 
##  42.239  31.720  19.301 
## [1] "chr15"
##    user  system elapsed 
##  32.710  25.866  16.142 
## [1] "chr16"
##    user  system elapsed 
##  32.812  25.984  14.967 
## [1] "chr17"
##    user  system elapsed 
##  36.685  29.337  16.745 
## [1] "chr18"
##    user  system elapsed 
##  32.599  26.164  15.364 
## [1] "chr19"
##    user  system elapsed 
##  38.858  30.109  18.269 
## [1] "chr20"
##    user  system elapsed 
##  33.306  26.034  15.106 
## [1] "chrX"
##    user  system elapsed 
##   3.698   2.451   2.232 
## [1] "chrY"
##    user  system elapsed 
##   0.310   0.198   0.397 
## [1] "Data from tissue t70-white-adipose was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 74742"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6962440     100
## [1] "Counts matrix dim after low counts filter:"
## [1] 1359690     100
## [1] "chr1"
##    user  system elapsed 
## 143.202 121.355  79.148 
## [1] "chr2"
##    user  system elapsed 
##  69.845  55.487  34.561 
## [1] "chr3"
##    user  system elapsed 
##  82.141  63.092  45.235 
## [1] "chr4"
##    user  system elapsed 
##  59.561  46.793  29.683 
## [1] "chr5"
##    user  system elapsed 
##  81.688  62.832  38.902 
## [1] "chr6"
##    user  system elapsed 
##  56.102  44.803  27.450 
## [1] "chr7"
##    user  system elapsed 
##  75.489  57.604  36.846 
## [1] "chr8"
##    user  system elapsed 
##  64.982  50.269  32.330 
## [1] "chr9"
##    user  system elapsed 
##  45.497  34.038  22.392 
## [1] "chr10"
##    user  system elapsed 
##  88.857  71.834  47.602 
## [1] "chr11"
##    user  system elapsed 
##  20.884  12.384  10.835 
## [1] "chr12"
##    user  system elapsed 
##  32.307  19.799  19.611 
## [1] "chr13"
##    user  system elapsed 
##  23.466  13.690  10.578 
## [1] "chr14"
##    user  system elapsed 
##  30.658  18.312  13.605 
## [1] "chr15"
##    user  system elapsed 
##  24.027  13.774  10.548 
## [1] "chr16"
##    user  system elapsed 
##  27.137  14.689  12.757 
## [1] "chr17"
##    user  system elapsed 
##  28.270  15.565  12.178 
## [1] "chr18"
##    user  system elapsed 
##  19.984  11.295   8.575 
## [1] "chr19"
##    user  system elapsed 
##  30.502  16.679  12.874 
## [1] "chr20"
##    user  system elapsed 
##  27.257  15.316  11.664 
## [1] "chrX"
##    user  system elapsed 
##   1.940   1.091   0.985 
## [1] "chrY"
##    user  system elapsed 
##   0.255   0.181   0.339

Add the phenotypic data

#Add the phenotypic data
pheno_data = parse_pheno_data(pheno_bucket,local_path = pheno_local_dir,
                              remove_prev_files = F,GSUTIL_PATH=gsutil_cmd)
# add a tissue variable (for convinience)
pheno_data$viallabel_data$tissue = 
  pheno_data$viallabel_data$specimen.processing.sampletypedescription
# add the time to freeze variable ((for convinience))
pheno_data$viallabel_data$time_to_freeze = 
  pheno_data$viallabel_data$calculated.variables.frozetime_after_train - 
  pheno_data$viallabel_data$calculated.variables.deathtime_after_train
pheno_data$viallabel_data$time_to_freeze = pmax(0,pheno_data$viallabel_data$time_to_freeze)
# add a binary is_control variable
pheno_data$viallabel_data$is_control = as.numeric(grepl("control",
  pheno_data$viallabel_data$key.anirandgroup,ignore.case = T))
# add the timepoint as a number
# x - the text description of the group
get_numeric_timepoint<-function(x){
  v = rep(0,length(x))
  tps = c("Eight"=8,"Four"=4,"One"=1,"Two"=2)
  for(tp in names(tps)){
    v[grepl(paste0(tp,"-week"),x,perl = T,ignore.case = T)] = tps[tp]
  }
  return(v)
}
pheno_data$viallabel_data$timepoint = get_numeric_timepoint(
  pheno_data$viallabel_data$key.anirandgroup
)

#sample_covs = pheno_data$viallabel_data[,biospec_cols]
pheno_data=pheno_data[[2]]
sample_covs = pheno_data[,biospec_cols]

Dataset preprocessing

We transform the normalized data into M values and quantile normalize the resulting matrix. These can be interpreted as the base2 logit transformation of the proportion of methylated signal at each locus.

# a helper function to run MDS on the top coefficient of variation features
top_var_plotMDS<-function(x,n=50000,samples_to_remove=NULL,...){
  n = min(n,nrow(x))
  cvs = apply(x,1,sd)/abs(rowMeans(x))
  cv_thr = sort(cvs,decreasing=T)[n]
  x = x[cvs>=cv_thr,]
  if(!is.null(samples_to_remove) && length(samples_to_remove)>0){
    x = x[,! colnames(x) %in% samples_to_remove]
  }
  plotMDS(x,...)
}
tissue2Mmatrix = list()
n_features_for_MDS = 5000
for(tissue in names(tissue2rrbs_data)){
  y = tissue2rrbs_data[[tissue]]
  Me <- y$counts[, grepl("-Me",colnames(y$counts))]
  Un <- y$counts[, grepl("-Un",colnames(y$counts))]
  # The M-value can be interpreted as the base2 logit transformation of the
  # proportion of methylated signal at each locus:
  M <- log2(Me + 2) - log2(Un + 2)
  colnames(M) <- gsub("-Me","",colnames(M))
  tissue2Mmatrix[[tissue]] = run_quantile_normalization(M)
  M = M[,!colnames(M) %in% samples_to_remove]
  
  # MDS plot
  # In this plot the distance between each pair of samples represents the average logit
  # change between the samples for the top most differentially methylated
  # CpG loci between that pair of samples.
  sex = sample_covs[colnames(M),1]
  sex[sex==1] = "M";sex[sex==2] = "F"
  M_group = paste(sex,
    pheno_data[colnames(M),"timepoint"],
    pheno_data[colnames(M),"is_control"],sep="_")
  cols = rainbow(length(unique(M_group)))
  names(cols) = unique(M_group)
  pchs = sample(1:22)[1:length(unique(M_group))]
  names(pchs) = names(cols)
  top_var_plotMDS(M, n = n_features_for_MDS,
          pch = pchs[M_group],col=cols[M_group],
          main = paste("MDS plot before quantile",tissue),lwd=2,cex=1)
  legend(names(cols),x="bottomright",ncol=4,cex=0.7,pch=pchs,col=cols)
  
  samp = sample(1:ncol(M))[1:20]
  samp2 = sample(1:nrow(M))[1:min(20000,nrow(M))]
  boxplot(M[samp2,samp],
          main=paste0(tissue,"\nsamples before quantile (",nrow(M)," features)"),
          ylab = "log2 M values",xaxt="n")
  
  M = run_quantile_normalization(M)
  top_var_plotMDS(M, n = n_features_for_MDS,
          pch = pchs[M_group],col=cols[M_group],
          main = paste("MDS plot after quantile",tissue),lwd=2,cex=1)
  legend(names(cols),x="bottomright",ncol=4,cex=0.7,pch=pchs,col=cols)
  
  # Compare with vs. without adjustment
  cvs = apply(M,1,sd)/abs(rowMeans(M))
  cv_thr = sort(cvs,decreasing=T)[n_features_for_MDS]
  M = M[cvs>=cv_thr,]
  curr_covs = data.frame(
    pct_unaligned_1 = poly(pipelineqc_scores[colnames(M),"pct_Unaligned"],1)[,1],
    pct_unaligned_2 = poly(pipelineqc_scores[colnames(M),"pct_Unaligned"],2)[,2]
  )
  newM = apply(M,1,
               function(x,covs){
                 df = data.frame(x=x,covs);
                 residuals(lm(x~pct_unaligned_1+pct_unaligned_2,data=df))},
               covs = curr_covs)
  newM = t(newM)
  top_var_plotMDS(newM, n = n_features_for_MDS,
          pch = pchs[M_group],col=cols[M_group],
          main = paste("MDS plot after quantile, after adj",tissue),lwd=2,cex=1)
  legend(names(cols),x="bottomright",ncol=4,cex=0.7,pch=pchs,col=cols)
  
  gc()
}

Examine the pipeline’s technical covariates

tech_var_results = matrix(-1,nrow = length(tissue2Mmatrix),ncol=length(pipeline_qc_cols))
colnames(tech_var_results) = pipeline_qc_cols
rownames(tech_var_results) = names(tissue2Mmatrix)
for(curr_tissue_name in names(tissue2Mmatrix)){
  currM = tissue2Mmatrix[[curr_tissue_name]]
  # get the metadata of the samples
  curr_meta = cbind(pipelineqc_scores[colnames(currM),pipeline_qc_cols],
               pheno_data[colnames(currM),biospec_cols])
  curr_meta$reads = log10(curr_meta$reads)
  curr_meta$sex = curr_meta$registration.sex
  curr_meta$group = paste(curr_meta$timepoint,curr_meta$is_control,sep="_")
  for(cov in pipeline_qc_cols){
    df = data.frame(x = curr_meta[[cov]],group=curr_meta$group,sex=curr_meta$sex)
    lm1 = lm(x~sex+group+sex:group,data=df)
    lm0 = lm(x~sex,data=df)
    an = anova(lm0,lm1)
    tech_var_results[curr_tissue_name,cov] = an[2,6]
  }
}
ggcorrplot(tech_var_results,p.mat = tech_var_results,legend.title = "P-value",sig.level = 0.01)

library(corrplot)
tech_var_results2 = pmin(tech_var_results,0.2)
corrplot(tech_var_results2,is.corr = F,
         p.mat = tech_var_results,sig.level = 0.01,cl.lim = c(0,0.2),cl.length = 21)

# Some plots of selected variables
tissue = "t55-gastrocnemius"
currM = tissue2Mmatrix[[tissue]]
curr_meta = cbind(pipelineqc_scores[colnames(currM),pipeline_qc_cols],
             pheno_data[colnames(currM),biospec_cols])
curr_meta$reads = log10(curr_meta$reads)
curr_meta$sex = curr_meta$registration.sex
curr_meta$sex[curr_meta$sex=="1"] = "F"
curr_meta$sex[curr_meta$sex=="2"] = "M"
curr_meta$timepoint[curr_meta$is_control>0] = "controls"
curr_meta$timepoint[!curr_meta$is_control] = paste0(" w",curr_meta$timepoint[!curr_meta$is_control])
curr_meta$group = curr_meta$timepoint
boxplot(reads~group:sex,data=curr_meta,las=2,xlab="",col="cyan",main=tissue)

boxplot(pct_chrM~group:sex,data=curr_meta,las=2,xlab="",col="cyan",main=tissue)

# Some plots of selected variables
tissue = "t68-liver" 
currM = tissue2Mmatrix[[tissue]]
curr_meta = cbind(pipelineqc_scores[colnames(currM),pipeline_qc_cols],
             pheno_data[colnames(currM),biospec_cols])
curr_meta$reads = log10(curr_meta$reads)
curr_meta$sex = curr_meta$registration.sex
curr_meta$sex[curr_meta$sex=="1"] = "F"
curr_meta$sex[curr_meta$sex=="2"] = "M"
curr_meta$timepoint[curr_meta$is_control>0] = "controls"
curr_meta$timepoint[!curr_meta$is_control] = paste0(" w",curr_meta$timepoint[!curr_meta$is_control])
curr_meta$group = curr_meta$timepoint
boxplot(pct_Unaligned~group:sex,data=curr_meta,las=2,xlab="",col="cyan",main=tissue)

# Some plots of selected variables
tissue = "t69-brown-adipose" 
currM = tissue2Mmatrix[[tissue]]
curr_meta = cbind(pipelineqc_scores[colnames(currM),pipeline_qc_cols],
             pheno_data[colnames(currM),biospec_cols])
curr_meta$reads = log10(curr_meta$reads)
curr_meta$sex = curr_meta$registration.sex
curr_meta$sex[curr_meta$sex=="1"] = "F"
curr_meta$sex[curr_meta$sex=="2"] = "M"
curr_meta$timepoint[curr_meta$is_control>0] = "controls"
curr_meta$timepoint[!curr_meta$is_control] = paste0(" w",curr_meta$timepoint[!curr_meta$is_control])
curr_meta$group = curr_meta$timepoint
boxplot(pct_chrM~group:sex,data=curr_meta,las=2,xlab="",col="cyan",main=tissue)

boxplot(pct_GC~group:sex,data=curr_meta,las=2,xlab="",col="cyan",main=tissue)

# Some plots of selected variables
tissue = "t58-heart" 
currM = tissue2Mmatrix[[tissue]]
curr_meta = cbind(pipelineqc_scores[colnames(currM),pipeline_qc_cols],
             pheno_data[colnames(currM),biospec_cols])
curr_meta$reads = log10(curr_meta$reads)
curr_meta$sex = curr_meta$registration.sex
curr_meta$sex[curr_meta$sex=="1"] = "F"
curr_meta$sex[curr_meta$sex=="2"] = "M"
curr_meta$timepoint[curr_meta$is_control>0] = "controls"
curr_meta$timepoint[!curr_meta$is_control] = paste0(" w",curr_meta$timepoint[!curr_meta$is_control])
curr_meta$group = curr_meta$timepoint
boxplot(pct_Unaligned~group:sex,data=curr_meta,las=2,xlab="",col="cyan",main=tissue)

MOP-flagged samples

The RRBS pipeline manual of procedures (MOP) defines a flagged sample (e.g., potentially problematic) as a sample that satisfies one of the following: (1) number of read pairs <20M, (2) Low number of uniquely mapped reads: pct_Uniq <50%, (3) mapped read count (pct_Uniq*reads) < 50% of the median mapped read count of all samples within a tissue (within a sequencing batch), (4) bisulfite conversion efficiency less than 95%, i.e lambda_pct_CpG>5%, but limited to samples with lambda_pct_Uniq>0.5%, (5)unexpected strands mapped: pct_OT<30% or pct_OB>70%; pct_CTOT>10% or pct_CTOB>10%, and (6) high duplications based on UMI: pct_umi_dup >20%.

pipelineqc_scores$pipeline_flags =  rrbs_pipeline_flagged_samples(pipelineqc_scores)
mop_flagged_samples = rownames(pipelineqc_scores)[nchar(pipelineqc_scores$pipeline_flags)>0]

PCA visualization (sex and group)

We plot the top two PCs for each tissue, color and shape correspond to randomization group and sex.

tissue2pca = list()
for(tissue in names(tissue2Mmatrix)){
  curr_data = tissue2Mmatrix[[tissue]]
  # remove vial labels that start with 8 (reference samples)
  curr_data = curr_data[,grepl("^9",colnames(curr_data),perl = T)]
  # remove zero variance rows
  curr_data = curr_data[apply(curr_data,1,sd)>0,]
  # remove identified outliers
  curr_data = curr_data[,!colnames(curr_data) %in% samples_to_remove]
    
  curr_pca = prcomp(t(curr_data),scale. = T,center = T,rank. = num_pcs)
  curr_pcax = curr_pca$x[,1:num_pcs]
  explained_var = summary(curr_pca)[["importance"]][3,5]

  # plot
  df = data.frame(curr_pcax,
    randgroup = pheno_data[rownames(curr_pcax),"key.anirandgroup"],
    sex = as.character(pheno_data[rownames(curr_pcax),"registration.sex"]),
    stringsAsFactors = F
  )
  df$sex[df$sex=="1"] = "F"
  df$sex[df$sex=="2"] = "M"
  p = ggplot(df) + 
      geom_point(aes(x=PC1, y=PC2,col=randgroup, shape=sex)) +
      ggtitle(tissue)
  plot(p)
  tissue2pca[[tissue]] = df
  gc()
}

PC-based association analysis

Here we analyze the top principal components (5) in each tissue and compute their association with the selected variables above (e.g., the pipeline qc metrics). We use Spearman correlation and a linear test for significance.

For each dataset we also add the correlations among the metadata variables. These two analyses should be interpreted together, as the analyzed variables are not independent.

pcs_vs_qc_var_report = c()
metadata_variable_assoc_report = c()
for(currname in names(tissue2Mmatrix)){
  curr_pcax = tissue2pca[[currname]]
  curr_pcax = curr_pcax[,grepl("^PC",colnames(curr_pcax))]
  # get the metadata of the samples
  curr_meta = cbind(pipelineqc_scores[rownames(curr_pcax),pipeline_qc_cols],
                     pheno_data[rownames(curr_pcax),biospec_cols])
  # remove metadata variables with NAs
  curr_meta = curr_meta[,!apply(is.na(curr_meta),2,any)]
  # remove bid and pid
  curr_meta = curr_meta[,!grepl("pid|bid",names(curr_meta))]
  # remove the text group description
  curr_meta = curr_meta[!grepl("randgr",names(curr_meta))]
  # remove fields withe zero variance
  curr_meta = curr_meta[,apply(curr_meta,2,sd)>0]
  
  # Assumption here: all metadata values are numeric
  corrs = cor(curr_pcax,curr_meta,method="spearman")
  corrsp = pairwise_eval(
    curr_pcax[rownames(curr_meta),],
    curr_meta,func=pairwise_association_wrapper,
    f=1)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(t(corrs),lab=T,lab_size=2.5,hc.order = F) +
    ggtitle(currname) +
    theme(plot.title = element_text(hjust = 0.5,size=20)))
  
  for(i in 1:nrow(corrsp)){
    for(j in 1:ncol(corrsp)){
      if(corrsp[i,j]>p_thr){next}
      pcs_vs_qc_var_report = rbind(pcs_vs_qc_var_report,
            c(currname,
              rownames(corrsp)[i],colnames(corrsp)[j],corrs[i,j],corrsp[i,j])
            )
    }
  }
  colnames(pcs_vs_qc_var_report) = c("Dataset(tissue,site)","PC",
                                  "qc_metric","rho(spearman)","p-value")
  
  ####
  # compute correlations among the metadata variables
  ####
  corrs = cor(curr_meta,method="spearman")
  corrsp = pairwise_eval(
    curr_meta,func=pairwise_association_wrapper,
    f=1)
  
  # Some ggplots have to be printed to be shown in the notebook
  print(ggcorrplot(corrs,lab=T,lab_size=1.5,hc.order = T))
  
  for(n1 in rownames(corrsp)){
    for(n2 in rownames(corrsp)){
      if(n1==n2){break}
      if(n1 %in% biospec_cols &&
         n2 %in% biospec_cols) {next}
      if(corrsp[n1,n2]>p_thr){next}
      metadata_variable_assoc_report = rbind(metadata_variable_assoc_report,
            c(currname,n1,n2,corrs[n1,n2],corrsp[n1,n2])
            )
    }
  }
  if(!is.null(dim(metadata_variable_assoc_report))){
    colnames(metadata_variable_assoc_report) = c(
      "Dataset(tissue,site)","Var1","Var2","rho(spearman)","p-value")
  }
}

# Format the reports - for a nicer presentation in a table
pcs_vs_qc_var_report[,5] = format(
  as.numeric(pcs_vs_qc_var_report[,5]),digits=3)
pcs_vs_qc_var_report[,4] = format(
  as.numeric(pcs_vs_qc_var_report[,4]),digits=3)
metadata_variable_assoc_report[,5] = format(
  as.numeric(metadata_variable_assoc_report[,5]),digits=3)
metadata_variable_assoc_report[,4] = format(
  as.numeric(metadata_variable_assoc_report[,4]),digits=3)

PCA outliers

In this analysis, outliers are flagged by examining the boxplot of each PC, extending its whiskers to three times the inter-quantile range away from the boxplot. Samples outside this range are then flagged.

Note that samples are flagged using an automatic analysis of the principal components. Such analyses may flag samples because of true biological effects and thus further examination is required before determining if flagged samples represent low quality samples.

pca_outliers_report = c()
for(currname in names(tissue2pca)){
  curr_pcax = tissue2pca[[currname]]
  curr_pcax = curr_pcax[,grepl("^PC",colnames(curr_pcax))]
  
  # Univariate: use IQRs
  pca_outliers = c()
  for(j in 1:ncol(curr_pcax)){
    outlier_values <- boxplot.stats(curr_pcax[,j],coef = OUTLIER_IQR_THR)$out
    for(outlier in names(outlier_values)){
      pca_outliers_report = rbind(pca_outliers_report,
       c(currname,paste("PC",j,sep=""),outlier,
         format(outlier_values[outlier],digits=5))
      )
      if(!is.element(outlier,names(pca_outliers))){
        pca_outliers[outlier] = outlier_values[outlier]
      }
    }
  }
  
  # Plot the outliers
  if(length(pca_outliers)>0){
    # print(length(kNN_outliers))
    df = data.frame(curr_pcax,
                outliers = rownames(curr_pcax) %in% names(pca_outliers))
    col = rep("black",nrow(df))
    col[df$outliers] = "green"
    plot(df$PC1,df$PC2,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(currname,"flagged outliers"),
         xlab = "PC1",ylab="PC2")
    plot(df$PC3,df$PC4,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
         main = paste(currname,"flagged outliers"),
         xlab = "PC3",ylab="PC4")
  }
}
if(!is.null(dim(pca_outliers_report))){
  colnames(pca_outliers_report) =  c("dataset","PC","sample","score")
}

Sex checks

We verify that the sex of a sample can be predicted from the percent of chromosome Y and percent of chromosome X reads. This is done automatically by training a logistic regression classifier, but we also plot the 2D plot for each tissue.

# sex checks - using simple logistic regression classifier (LOO-CV)
sex_read_info = cbind(pipelineqc_scores$pct_chrX,pipelineqc_scores$pct_chrY)
rownames(sex_read_info) = rownames(pipelineqc_scores)
sex_check_outliers = c()
for(currname in names(tissue2Mmatrix)){
  # 1 is female
  pca_df = tissue2pca[[currname]]
  read_info = sex_read_info[rownames(pca_df),]
  df = data.frame(sex=as.factor(pca_df$sex),
                  pct_chrX = read_info[,1],
                  pct_chrY = read_info[,2],stringsAsFactors = F)
  p = ggplot(df) + 
      geom_point(aes(x=pct_chrX, y=pct_chrY,col=sex, shape=sex)) +
      ggtitle(paste0(currname," - sex check"))
  plot(p)
  train_control <- trainControl(method = "cv", number = nrow(df),
                                savePredictions = TRUE)
  # train the model on training set
  model <- train(sex ~ .,data = df,
               trControl = train_control,
               method = "glm", family=binomial(link="logit"))
  # CV results
  cv_res = model$pred
  cv_errors = cv_res[,1] != cv_res[,2]
  err_samples = rownames(df)[cv_res[cv_errors,"rowIndex"]]
  for(samp in err_samples){
    sex_check_outliers = rbind(sex_check_outliers,
                               c(currname,samp))
  }
}

if(! is.null(dim(sex_check_outliers))){
  colnames(sex_check_outliers) = c("dataset","sample")
}

Differential analysis

ftest_res = list() # keeps all ftest results
tpDA = list() # keep the timewise results for both males and females
for(dataset_name in names(rdata_files)){
  
  dea_res_file = paste0(dataset_name,"_rrbs_dea_results.RData")
  if (load_dea_results_from_rdata && file.exists(dea_res_file)){
    print(paste0("DEA results file exists: ",dea_res_file,", loading results and skipping computation"))
    load(dea_res_file)
    ftest_res[[dataset_name]] = ftest_results
    tpDA[[dataset_name]] = timewise_results
    print("done")
    #gc()
    next
  }
  
  print(paste("Analyzing dataset:",dataset_name))
  y = tissue2rrbs_data[[dataset_name]]
  is_sample = grepl("^9",colnames(y),perl=T)
  y = y[,is_sample]
  
  #remove outlier samples before DEA
  s_me=paste(samples_to_remove, "Me", sep="-")
  s_un=paste(samples_to_remove, "Un", sep="-")
  y <- y[ , ! colnames(y) %in% c(s_me,s_un)]
  
  curr_samps = sapply(colnames(y),function(x)strsplit(x,split="-")[[1]][1])
  #### Generate samples_to_remove per tissue list
  #get tissue code
  tcode = strsplit(dataset_name,split="-")[[1]][1]
  tcode = strsplit(tcode,split="t")[[1]][2]
  search_pattern = paste0(tcode,"[0-9]{2}$")
  outlier_samples = str_subset(samples_to_remove,search_pattern)
  if (length(outlier_samples) == 0){
    remove_samples = ""
  } else{
    remove_samples = paste(outlier_samples,collapse=",")
  }
  # Parse the covariates
  covs = data.frame(
    sample = as.factor(curr_samps),
    Me = grepl("Me",colnames(y)),
    sex = pheno_data[curr_samps,"registration.sex"],
    timepoint = pheno_data[curr_samps,"timepoint"],
    is_control = pheno_data[curr_samps,"is_control"],
    pct_unaligned_1 = poly(pipelineqc_scores[curr_samps,"pct_Unaligned"],1)[,1],
    pct_unaligned_2 = poly(pipelineqc_scores[curr_samps,"pct_Unaligned"],2)[,2]
  )
  covs$sex[covs$sex=="2"] = "M";covs$sex[covs$sex=="1"]="F"
  rownames(covs) = colnames(y)
  covs$group_tp=factor(paste(covs$timepoint,covs$is_control,sep="_"))
  
  ## Create input file to run the DEA on a cluster/GCP
  #save(y,covs,remove_samples,dataset_name,edger_tol,file=paste0(dataset_name,"_rrbs_dea_input.RData"))
  #next
  
  #split covariates to males and females
  covs_males = covs[covs$sex == "M", ] 
  covs_females = covs[covs$sex == "F", ]
  samples_males=rownames(covs_males)
  samples_females=rownames(covs_females)
  
  # subset y into males and females
  y_males = y[ ,samples_males]
  y_females = y[ ,samples_females]
  
  #create design matrices for for males and females separately
  # refactor the samples to have the correct levels
  covs_males$sample = as.factor(as.character(covs_males$sample))
  covs_females$sample = as.factor(as.character(covs_females$sample))
  
  # create design matrices for time-wise and F tests
  # (see https://support.bioconductor.org/p/12119/ and the limma guide) 
  #design matrix for males  
  samples_mat_male = model.matrix(~0+sample,data=covs_males)
  tp_mat_male = model.matrix(~0+group_tp,data=covs_males)
  #tp_mat_male = model.matrix(~0+group_tp+pct_unaligned_1+pct_unaligned_2,data=covs_males)
  tp_mat_male[!covs_males$Me,] = 0
  des_males = cbind(samples_mat_male,tp_mat_male)
  #make sure the rows of design matrix and columns of counts are identical
  if(sum(!row.names(des_males) == colnames(y_males$counts))>0){
    print(paste("ERROR in dataset",dataset_name,"check sample order before moving on"))
    break
  }
  
  #design matrix for females
  samples_mat_female = model.matrix(~0+sample,data=covs_females)
  tp_mat_female = model.matrix(~0+group_tp,data=covs_females)
  #tp_mat_female = model.matrix(~0+group_tp+pct_unaligned_1+pct_unaligned_2,data=covs_females)
  tp_mat_female[!covs_females$Me,] = 0
  des_females = cbind(samples_mat_female,tp_mat_female)
  if(sum(!row.names(des_females) == colnames(y_females$counts))>0){
    print(paste("ERROR in dataset",dataset_name,"check sample order before moving on"))
    break
  }
  
  full_model_str = "~0+sample+1me+group_me"
  null_model_str = "~0+sample+1me"
  
  # define the contrasts for males and females the analyses below
  C_ttests_female = makeContrasts(
  group_tp1_0 - group_tp8_1,group_tp2_0 - group_tp8_1,
  group_tp4_0 - group_tp8_1,group_tp8_0 - group_tp8_1,
  levels = des_females
  )
  
  C_ttests_male = makeContrasts(
    group_tp1_0 - group_tp8_1,group_tp2_0 - group_tp8_1,
    group_tp4_0 - group_tp8_1,group_tp8_0 - group_tp8_1,
    levels = des_males
  )
  
  #Estimate dispersions for males and females separately
  print("Estimating dispersion for the male design matrix...")
  y1_males <- estimateDisp(y_males, design=des_males,tol = edger_tol)
  print("Done")
  print("Running glmQLFit...")
  fit.ttest_males <- glmQLFit(y1_males,des_males)
  print("Done")
  
  print("Estimating dispersion for the female design matrix...")
  y1_females <- estimateDisp(y_females, design=des_females,tol = edger_tol)
  print("Done")
  print("Running glmQLFit...")
  fit.ttest_females <- glmQLFit(y1_females,des_females)
  print("Done")
  

  # extract contrast info
  print("Extracting timewise results from the contrasts...")
  #### male contrast
  tissue_tp_res = c()
  for(col in colnames(C_ttests_male)){
    sex_str = "male"
    curr_tp = strsplit(col,split="_")[[1]][2]
    curr_tp = strsplit(curr_tp,split="tp")[[1]][2]
    print(paste("Fitting timewise model for:",col))
    res <- glmQLFTest(fit.ttest_males,contrast=C_ttests_male[,col])
    title_name = paste0(dataset_name,"_M","_",curr_tp,"w vs 8w control")
    # plotMD(res,main = title_name)
    # extract the results into a table
    edger_res <- topTags(res, n=Inf, p.value = 1,
                         adjust.method = "BH",sort.by = "none")$table
    # add z-scores
    edger_res$F[edger_res$F < 0] = 1e-10
    t.stat <- sign(edger_res$logFC) * sqrt(edger_res$F)
    z <- zscoreT(t.stat, df=res$df.total)
    curr_res = data.frame(
      feature_ID = rownames(y_males),
      edger_res[,1:4],
      assay = "epigen-rrbs",
      tissue = dataset_name,
      removed_samples = remove_samples,
      sex = sex_str,
      #logFC_se = TBD,
      logFC = edger_res$logFC,
      fscore = edger_res$F,
      zscore = z,
      covariates = NA,
      comparison_group = paste0(curr_tp,"w"),
      p_value = edger_res$PValue,
      adj_p_value = edger_res$FDR
    )
    # add the results
    tissue_tp_res = rbind(tissue_tp_res,curr_res)
  }
  print("Done")
  ##### female contrast
  for(col in colnames(C_ttests_female)){
    sex_str = "female"
    curr_tp = strsplit(col,split="_")[[1]][2]
    curr_tp = strsplit(curr_tp,split="tp")[[1]][2]
    print(paste("Fitting timewise model for:",col))
    res <- glmQLFTest(fit.ttest_females,contrast=C_ttests_female[,col])
    title_name = paste0(dataset_name,"_F","_",curr_tp,"w vs 8w control")
    #plotMD(res,main = title_name)
    # extract the results into a table
    edger_res <- topTags(res, n=Inf, p.value = 1,
                         adjust.method = "BH",sort.by = "none")$table
    # add z-scores
    edger_res$F[edger_res$F < 0] = 1e-10
    t.stat <- sign(edger_res$logFC) * sqrt(edger_res$F)
    z <- zscoreT(t.stat, df=res$df.total)
    curr_res = data.frame(
      feature_ID = rownames(y_females),
      edger_res[,1:4],
      assay = "epigen-rrbs",
      tissue = dataset_name,
      removed_samples = remove_samples,
      #removed_samples = paste(samples_to_remove,collapse=","),
      sex = sex_str,
      #logFC_se = TBD,
      logFC = edger_res$logFC,
      fscore = edger_res$F,
      zscore = z,
      covariates = NA,
      comparison_group = paste0(curr_tp,"w"),
      p_value = edger_res$PValue,
      adj_p_value = edger_res$FDR
    )
    # add the results
    tissue_tp_res = rbind(tissue_tp_res,curr_res)
  }
  print("Done")
  # add results to larger list
  tpDA[[dataset_name]] = tissue_tp_res
  
  #males
  print("Fitting the model for the F-tests...")
  ftest_tp_mat_male = model.matrix(~1+group_tp,data=covs_males)
  #ftest_tp_mat_male = model.matrix(~1+pct_unaligned_1+pct_unaligned_2+group_tp,data=covs_males)
  ftest_tp_mat_male[!covs_males$Me,] = 0
  ftest_des_males = cbind(samples_mat_male,ftest_tp_mat_male)
  y2_males <- estimateDisp(y_males, design=ftest_des_males,tol = edger_tol)
  fit.ftest_males <- glmQLFit(y2_males,ftest_des_males)
  is_group_variable = grepl("group",colnames(ftest_des_males))
  res <- glmQLFTest(fit.ftest_males,coef=colnames(ftest_des_males)[is_group_variable])
  ftest_edger_res <- topTags(res, n=Inf, p.value = 1,
                      adjust.method = "BH",sort.by ="none")$table
  print("Done")
  
  # add the results
  curr_f_test_res_males = data.frame(
    feature_ID = rownames(y_males),
    ftest_edger_res[,1:4],
    assay = "epigen-rrbs",
    tissue = dataset_name,
    #sex = "male",
    p_value_male = ftest_edger_res$PValue,
    #adj_p_value = ftest_edger_res$FDR,
    fscore_male = ftest_edger_res$F,
    full_model = full_model_str,
    reduced_model = null_model_str
  )
  
  #females
  print("Fitting the model for the F-tests for females...")
  ftest_tp_mat_female = model.matrix(~1+group_tp,data=covs_females)
  #ftest_tp_mat_female = model.matrix(~1+pct_unaligned_1+pct_unaligned_2+group_tp,data=covs_females)
  ftest_tp_mat_female[!covs_females$Me,] = 0
  ftest_des_females = cbind(samples_mat_female,ftest_tp_mat_female)
  y2_females <- estimateDisp(y_females, design=ftest_des_females,tol = edger_tol)
  fit.ftest_females <- glmQLFit(y2_females,ftest_des_females)
  is_group_variable = grepl("group",colnames(ftest_des_females))
  res <- glmQLFTest(fit.ftest_females,coef=colnames(ftest_des_females)[is_group_variable])
  ftest_edger_res <- topTags(res, n=Inf, p.value = 1,
                      adjust.method = "BH",sort.by ="none")$table
  print("Done")
  
  curr_f_test_res_females = data.frame(
    feature_ID = rownames(y_females),
    ftest_edger_res[,1:4],
    assay = "epigen-rrbs",
    tissue = dataset_name,
    #sex = "female",
    p_value_female = ftest_edger_res$PValue,
    #adj_p_value = ftest_edger_res$FDR,
    fscore_female = ftest_edger_res$F,
    full_model = full_model_str,
    reduced_model = null_model_str
  )
  
  ###combine curr_f_test_res from males and females
  if(nrow(curr_f_test_res_females)!=nrow(curr_f_test_res_males)){
    print(paste("ERROR in dataset",dataset_name,"male and female ftest tables have different nrow"))
    break
  }
  if(any(curr_f_test_res_females$feature_ID != curr_f_test_res_males$feature_ID)){
    print(paste("ERROR in dataset",dataset_name,"male and female ftest tables have different row names"))
    break
  }
  common_col_names <- intersect(names(curr_f_test_res_males), names(curr_f_test_res_females))
  curr_f_test_res_combined = merge(curr_f_test_res_males,curr_f_test_res_females,
                                   by=common_col_names, all=TRUE)
  
  #Add sumlog p-value
  curr_f_test_res_combined$p_value = 
    apply(curr_f_test_res_combined[,c("p_value_male","p_value_female")],1,function(x)metap::sumlog(x)$p)
  
  # Add removed samples column
  curr_f_test_res_combined$removed_samples = remove_samples
  
  ### Add the curr_f_test_res_combined to existing ftest_res
  ftest_res[[dataset_name]]  = curr_f_test_res_combined
  print("Done")
  
  # free space, erase objects
  rm(edger_res);rm(ftest_edger_res)
  rm(des_males);rm(des_females)
  rm(y);rm(y1_males);rm(y1_females)
  gc()
}
gc()

Summarize the results

# TODO: implement this for lists
# ftest results
sumlog_counts = table(ftest_res$tissue,p.adjust(ftest_res$p_value,method="fdr")<0.1)
sumlog_counts_m = table(ftest_res$tissue,p.adjust(ftest_res$p_value_male,method="fdr")<0.1)
sumlog_counts_f = table(ftest_res$tissue,p.adjust(ftest_res$p_value_female,method="fdr")<0.1)
tp_counts = table(tpDA$tissue,tpDA$comparison_group,tpDA$adj_p_value < 0.1)
f_males = ftest_res$fscore_male
f_females = ftest_res$fscore_female
samp = sample(1:length(f_males))[1:10000]
f_m_rho = cor(f_males,f_females)
print("Num sig features per tissue using the sumlog test:")
print(sumlog_counts)
print("Num sig features, timewise:")
print(tp_counts[,,"TRUE"])
print("Male vs. female f-score correlation:")
print(f_m_rho)

A few plots: males vs. females:

# TODO: implement this for lists
randhist<-function(x,n=500000,...){
  x = x[sample(1:length(x))[1:n]]
  hist(x,...)
}
par(mfrow=c(3,3))
tissue = "t55-gastrocnemius"
hist(ftest_res[ftest_res$tissue==tissue,"p_value_male"],main="Gastroc, males f-test p-value")
hist(ftest_res[ftest_res$tissue==tissue,"p_value_female"],main="Gastroc, females f-test p-value")
hist(ftest_res[ftest_res$tissue==tissue,"p_value"],main="Gastroc, merged p-value")
cor(ftest_res[ftest_res$tissue==tissue,"fscore_female"],ftest_res[ftest_res$tissue==tissue,"fscore_male"])
tissue = "t58-heart"
hist(ftest_res[ftest_res$tissue==tissue,"p_value_male"],main="Heart, males f-test p-value")
hist(ftest_res[ftest_res$tissue==tissue,"p_value_female"],main="Heart, females f-test p-value")
hist(ftest_res[ftest_res$tissue==tissue,"p_value"],main="Heart, merged p-value")
cor(ftest_res[ftest_res$tissue==tissue,"fscore_female"],ftest_res[ftest_res$tissue==tissue,"fscore_male"])
tissue = "t69-brown-adipose"
hist(ftest_res[ftest_res$tissue==tissue,"p_value_male"],main="BAT, males f-test p-value")
hist(ftest_res[ftest_res$tissue==tissue,"p_value_female"],main="BAT, females f-test p-value")
hist(ftest_res[ftest_res$tissue==tissue,"p_value"],main="BAT, merged p-value")
cor(ftest_res[ftest_res$tissue==tissue,"fscore_female"],ftest_res[ftest_res$tissue==tissue,"fscore_male"])

Print the results to files:

out_bucket = dea_out_bucket
suffix = "20210820"

# remove trailing commas in the DEA tables, merge gene symbols and ids
for (tissue in names(ftest_res)){
  ftest_res[[tissue]]$Symbol = sapply(ftest_res[[tissue]]$Symbol,paste,collapse="")
  ftest_res[[tissue]]$Symbol = gsub(",$","", ftest_res[[tissue]]$Symbol)
  ftest_res[[tissue]]$EntrezID = sapply(ftest_res[[tissue]]$EntrezID,paste,collapse="")
  ftest_res[[tissue]]$EntrezID = gsub(",$","", ftest_res[[tissue]]$EntrezID)
  
  tpDA[[tissue]]$Symbol = sapply(tpDA[[tissue]]$Symbol,paste,collapse="")
  tpDA[[tissue]]$Symbol = gsub(",$","", tpDA[[tissue]]$Symbol)
  tpDA[[tissue]]$EntrezID = sapply(tpDA[[tissue]]$EntrezID,paste,collapse="")
  tpDA[[tissue]]$EntrezID = gsub(",$","", tpDA[[tissue]]$EntrezID)
}

# timewise tables - this is extremely slow for large datasets
# # use RData for timewise instead
# for(tissue in names(tpDA)){
#   currDA = tpDA[[tissue]]
#   local_fname = paste0(local_data_dir,
#                        "pass1b-06_",tissue,"_epigen-rrbs_timewise-dea_",suffix)
#   fwrite(currDA,file=local_fname,sep="\t",quote=F,
#               row.names = F,col.names = T)
#   cmd = paste(gsutil_cmd,"-m cp",local_fname,out_bucket)
#   system(cmd)
#   system(paste("rm",local_fname))
# }

# use RData for timewise instead
for(tissue in names(tpDA)){
  timewiseDA = tpDA[[tissue]]
  local_fname = paste0(local_data_dir,
                       "pass1b-06_",tissue,"_epigen-rrbs_timewise-dea_",suffix,".RData")
  save(timewiseDA,file=local_fname)
  cmd = paste(gsutil_cmd,"-m cp",local_fname,out_bucket)
  system(cmd)
  system(paste("rm",local_fname))
}

# f-test tables
for(tissue in names(ftest_res)){
  currTable = ftest_res[[tissue]]
  local_fname = paste0(local_data_dir,
                       "pass1b-06_",tissue,"_epigen-rrbs_training-dea_",suffix,".txt")
  fwrite(currTable,file=local_fname,sep="\t",quote=F,
              row.names = F,col.names = T)
  cmd = paste(gsutil_cmd,"-m cp",local_fname,out_bucket)
  system(cmd)
  system(paste("rm",local_fname))
}